library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.2
# Load the wine training set from disk and preview its first rows.
wines <- read.csv(
  file = "/Users/oyunm/Desktop/SMU/DS-6306-Doing-the-Data-Science/Project 2/Wine Train Set.csv"
)
head(wines)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1 7.2 0.34 0.34 12.6 0.048
## 2 2 6.0 0.27 0.28 4.8 0.063
## 3 3 6.9 0.26 0.49 1.6 0.058
## 4 4 6.6 0.25 0.34 3.0 0.054
## 5 5 7.1 0.17 0.43 1.3 0.023
## 6 6 6.0 0.29 0.25 1.4 0.033
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 7 41 0.99420 3.19 0.40 11.7 white
## 2 31 201 0.99640 3.69 0.71 10.0 white
## 3 39 166 0.99650 3.65 0.52 9.4 white
## 4 22 141 0.99338 3.26 0.47 10.4 white
## 5 33 132 0.99067 3.11 0.56 11.7 white
## 6 30 114 0.98794 3.08 0.43 13.2 white
## location quality
## 1 Texas 5
## 2 Texas 5
## 3 Texas 4
## 4 California 6
## 5 California 6
## 6 California 6
Excluding ID, there are 14 variables: three are categorical (quality, location, type) and the remaining 11 are numeric.
# Per-column summary of the full data set.
wines |> summary()
## ID fixed.acidity volatile.acidity citric.acid
## Min. : 1 Min. : 3.800 Min. :0.0800 Min. :0.0000
## 1st Qu.:1366 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500
## Median :2732 Median : 7.000 Median :0.2900 Median :0.3100
## Mean :2732 Mean : 7.218 Mean :0.3382 Mean :0.3185
## 3rd Qu.:4098 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900
## Max. :5463 Max. :15.900 Max. :1.5800 Max. :1.6600
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. : 0.60 Min. :0.00900 Min. : 1.00 Min. : 6.0
## 1st Qu.: 1.80 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0
## Median : 3.00 Median :0.04700 Median : 29.00 Median :118.0
## Mean : 5.42 Mean :0.05613 Mean : 30.58 Mean :115.9
## 3rd Qu.: 8.10 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:155.0
## Max. :31.60 Max. :0.61100 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.0
## 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5
## Median :0.9949 Median :3.210 Median :0.5100 Median :10.3
## Mean :0.9947 Mean :3.217 Mean :0.5318 Mean :10.5
## 3rd Qu.:0.9969 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.3
## Max. :1.0103 Max. :4.010 Max. :2.0000 Max. :14.9
## type location quality
## Length:5463 Length:5463 Min. :3.000
## Class :character Class :character 1st Qu.:5.000
## Mode :character Mode :character Median :6.000
## Mean :5.823
## 3rd Qu.:6.000
## Max. :9.000
# Column types and a preview of the first values in each column.
wines |> str()
## 'data.frame': 5463 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
## $ volatile.acidity : num 0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
## $ citric.acid : num 0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
## $ residual.sugar : num 12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
## $ chlorides : num 0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
## $ free.sulfur.dioxide : num 7 31 39 22 33 30 9 23 15 47 ...
## $ total.sulfur.dioxide: num 41 201 166 141 132 114 27 97 151 194 ...
## $ density : num 0.994 0.996 0.997 0.993 0.991 ...
## $ pH : num 3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
## $ sulphates : num 0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
## $ alcohol : num 11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
## $ type : chr "white" "white" "white" "white" ...
## $ location : chr "Texas" "Texas" "Texas" "California" ...
## $ quality : int 5 5 4 6 6 6 6 5 5 6 ...
Fixed acidity measures the natural acids in wine that contribute to its structure, flavor, and color. Up to quality 8, lower fixed acidity goes with higher quality, but for quality 9 that was not the case.
# Store quality as a factor so every quality level gets its own box.
wines[["quality"]] <- as.factor(wines[["quality"]])
ggplot(wines, aes(x = quality, y = fixed.acidity, colour = quality)) +
  geom_boxplot()
Volatile acidity comes from acids that evaporate quickly and contribute to a wine’s aroma. The lower the volatile acidity, the higher the quality.
# Boxplot of volatile acidity for each quality level.
ggplot(wines, aes(x = quality, y = volatile.acidity, colour = quality)) +
  geom_boxplot()
Citric acid can be used for acidification in wines that are naturally lacking in acid. It adds liveliness and freshness to the wine, bringing a fresher, fruity citrus note. The more citric acid there is, the higher the quality.
# Boxplot of citric acid for each quality level.
ggplot(wines, aes(x = quality, y = citric.acid, colour = quality)) +
  geom_boxplot()
Residual sugar is the natural grape sugar left in a wine after the alcoholic fermentation finishes. The medians do not differ much across quality levels.
# Boxplot of residual sugar for each quality level.
ggplot(wines, aes(x = quality, y = residual.sugar, colour = quality)) +
  geom_boxplot()
The amount of chloride in wine is influenced by the type of grape, the vineyard’s location, and the surrounding soil and water conditions. According to research, a high level of chlorides in wine can lead to a decrease in its market appeal. The boxplots show that wines with lower chlorides tend to have better quality.
# Boxplot of chlorides for each quality level.
ggplot(wines, aes(x = quality, y = chlorides, colour = quality)) +
  geom_boxplot()
Free sulfur dioxide is the amount of sulfur dioxide in wine that is available to protect it from oxidation. It inhibits the growth of microorganisms. The more free sulfur dioxide there is, the better the quality.
# Boxplot of free sulfur dioxide for each quality level.
ggplot(wines, aes(x = quality, y = free.sulfur.dioxide, colour = quality)) +
  geom_boxplot()
Total sulfur dioxide (TSO2) is the amount of sulfur dioxide that is free in the wine plus the amount that is bound to other chemicals in the wine. It is also a preservative used in wine-making to prevent oxidation and spoilage and to maintain freshness. The higher the total sulfur dioxide, the better the quality.
# Boxplot of total sulfur dioxide for each quality level.
ggplot(wines, aes(x = quality, y = total.sulfur.dioxide, colour = quality)) +
  geom_boxplot()
Density relates to concentration and a fuller body. Better-quality wines show lower density.
# Boxplot of density for each quality level.
ggplot(wines, aes(x = quality, y = density, colour = quality)) +
  geom_boxplot()
Lower pH makes the wine more stable and protects it against bacteria. The median pH across quality levels follows a curved pattern: it decreases from quality 3 to 5, and after quality 5 it increases with quality. However, quality 3 and quality 9 have similar median pH.
# Boxplot of pH for each quality level.
ggplot(wines, aes(x = quality, y = pH, colour = quality)) +
  geom_boxplot()
Sulfates are a group of chemical compounds, including sulfur dioxide. Yeast produces sulfites during fermentation. There are some differences in distribution; however, the medians do not differ much across quality levels.
# Boxplot of sulphates for each quality level.
ggplot(wines, aes(x = quality, y = sulphates, colour = quality)) +
  geom_boxplot()
Alcohol shows a curved pattern: it decreases up to quality 5 and increases from quality 6 onward.
# Boxplot of alcohol for each quality level, followed by a
# quality-by-type contingency table.
ggplot(wines, aes(x = quality, y = alcohol, colour = quality)) +
  geom_boxplot()
table(wines[["quality"]], wines[["type"]])
##
## red white
## 3 9 16
## 4 44 137
## 5 554 1225
## 6 536 1861
## 7 181 738
## 8 16 142
## 9 0 4
# Quality-by-location contingency table (exposes misspelled location labels).
table(wines[["quality"]], wines[["location"]])
##
## Califormia California Texas
## 3 2 0 23
## 4 2 33 146
## 5 10 450 1319
## 6 0 1939 458
## 7 0 878 41
## 8 0 83 75
## 9 0 4 0
# Fold the misspelled "Califormia" label into "California", then re-tabulate
# to confirm only two locations remain.
wines[["location"]] <- gsub(
  pattern = "Califormia",
  replacement = "California",
  x = wines[["location"]],
  ignore.case = TRUE
)
table(wines[["quality"]], wines[["location"]])
##
## California Texas
## 3 2 23
## 4 35 146
## 5 460 1319
## 6 1939 458
## 7 878 41
## 8 83 75
## 9 4 0
White wine mostly shows higher quality; however, we need to be cautious because of the difference in sample sizes.
# Count wines for every quality/type combination.
# `percentage` is the share of each type *within its quality level*, which is
# the quantity the filled bar chart below displays. Dividing by the grand
# total after dropping the groups would not match the chart.
wines_summary <- wines %>%
  group_by(quality, type) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(quality) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

# Proportional stacked bars per quality level. The raw counts are printed
# inside the segments so that sparse quality levels are easy to spot.
wines_summary %>%
  ggplot(aes(x = quality, y = count, fill = type)) +
  geom_col(position = "fill") +
  geom_text(
    aes(label = count),
    position = position_fill(vjust = 0.5), # centre each label in its segment
    color = "white"
  ) +
  ggtitle("A Barchart of Quality by Type") +
  xlab("Quality") +
  ylab("Proportion") +
  theme_bw()
California wines are mostly distributed toward the higher quality levels.
# Count wines for every quality/location combination.
# `percentage` is the share of each location *within its quality level*, which
# is the quantity the filled bar chart below displays. Dividing by the grand
# total after dropping the groups would not match the chart.
wines_summary <- wines %>%
  group_by(quality, location) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(quality) %>%
  mutate(percentage = count / sum(count)) %>%
  ungroup()

# Proportional stacked bars per quality level. The raw counts are printed
# inside the segments so that sparse quality levels are easy to spot.
wines_summary %>%
  ggplot(aes(x = quality, y = count, fill = location)) +
  geom_col(position = "fill") +
  geom_text(
    aes(label = count),
    position = position_fill(vjust = 0.5), # centre each label in its segment
    color = "white"
  ) +
  ggtitle("A Barchart of Quality by Location") +
  xlab("Quality") +
  ylab("Proportion") +
  theme_bw()
# Keep every column except the identifier and the two character columns,
# then inspect the structure of what is left.
excluded_columns <- c("ID", "type", "location")
selected_data <- wines[, !(names(wines) %in% excluded_columns)]
str(selected_data)
## 'data.frame': 5463 obs. of 12 variables:
## $ fixed.acidity : num 7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
## $ volatile.acidity : num 0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
## $ citric.acid : num 0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
## $ residual.sugar : num 12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
## $ chlorides : num 0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
## $ free.sulfur.dioxide : num 7 31 39 22 33 30 9 23 15 47 ...
## $ total.sulfur.dioxide: num 41 201 166 141 132 114 27 97 151 194 ...
## $ density : num 0.994 0.996 0.997 0.993 0.991 ...
## $ pH : num 3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
## $ sulphates : num 0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
## $ alcohol : num 11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
## $ quality : Factor w/ 7 levels "3","4","5","6",..: 3 3 2 4 4 4 4 3 3 4 ...
# Convert quality back to its numeric grade. Going through as.character()
# matters: as.numeric() applied directly to a factor returns the internal
# level codes (1, 2, ...) rather than the level labels ("3", "4", ...).
selected_data$quality <- as.numeric(as.character(selected_data$quality))
# Compute the correlation matrix for the selected columns
cor_matrix <- cor(selected_data)
print(cor_matrix)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity 1.00000000 0.21823464 0.333199973 -0.11266700
## volatile.acidity 0.21823464 1.00000000 -0.373323484 -0.20136563
## citric.acid 0.33319997 -0.37332348 1.000000000 0.13228846
## residual.sugar -0.11266700 -0.20136563 0.132288464 1.00000000
## chlorides 0.29605993 0.37292022 0.051041175 -0.12640693
## free.sulfur.dioxide -0.28288825 -0.34540899 0.122127249 0.40675420
## total.sulfur.dioxide -0.33361204 -0.40382125 0.177334114 0.49715732
## density 0.46627359 0.26191157 0.097985259 0.54533710
## pH -0.25210981 0.25590178 -0.322531623 -0.27318560
## sulphates 0.30405382 0.22359740 0.069782595 -0.18947356
## alcohol -0.08897084 -0.03713716 -0.002550528 -0.36869902
## quality -0.07128477 -0.26144015 0.090534263 -0.03622339
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## fixed.acidity 0.29605993 -0.28288825 -0.33361204
## volatile.acidity 0.37292022 -0.34540899 -0.40382125
## citric.acid 0.05104118 0.12212725 0.17733411
## residual.sugar -0.12640693 0.40675420 0.49715732
## chlorides 1.00000000 -0.19023849 -0.27194647
## free.sulfur.dioxide -0.19023849 1.00000000 0.71900054
## total.sulfur.dioxide -0.27194647 0.71900054 1.00000000
## density 0.36474494 0.03235223 0.03253618
## pH 0.03443033 -0.14575035 -0.23875927
## sulphates 0.39911411 -0.18983600 -0.28530444
## alcohol -0.25496175 -0.18704256 -0.27434630
## quality -0.19869648 0.04148439 -0.05520419
## density pH sulphates alcohol
## fixed.acidity 0.466273587 -0.252109810 0.304053815 -0.088970837
## volatile.acidity 0.261911567 0.255901781 0.223597402 -0.037137156
## citric.acid 0.097985259 -0.322531623 0.069782595 -0.002550528
## residual.sugar 0.545337095 -0.273185595 -0.189473558 -0.368699015
## chlorides 0.364744942 0.034430326 0.399114115 -0.254961753
## free.sulfur.dioxide 0.032352229 -0.145750353 -0.189836005 -0.187042561
## total.sulfur.dioxide 0.032536183 -0.238759273 -0.285304438 -0.274346298
## density 1.000000000 0.002279624 0.262176353 -0.697516955
## pH 0.002279624 1.000000000 0.195875331 0.126339216
## sulphates 0.262176353 0.195875331 1.000000000 0.001861973
## alcohol -0.697516955 0.126339216 0.001861973 1.000000000
## quality -0.301996919 0.025142885 0.043687536 0.442294127
## quality
## fixed.acidity -0.07128477
## volatile.acidity -0.26144015
## citric.acid 0.09053426
## residual.sugar -0.03622339
## chlorides -0.19869648
## free.sulfur.dioxide 0.04148439
## total.sulfur.dioxide -0.05520419
## density -0.30199692
## pH 0.02514289
## sulphates 0.04368754
## alcohol 0.44229413
## quality 1.00000000
# Visualize the correlation matrix as circles with the coefficient printed
# in every cell.
# Uncomment the next line if corrplot is not installed yet.
# install.packages("corrplot")
library(corrplot)
corrplot(
  cor_matrix,
  method = "circle",
  addCoef.col = "black", # coefficient text colour
  number.cex = 0.8,      # coefficient text size
  tl.col = "black",      # variable label colour
  tl.cex = 0.8           # variable label size
)
# Fixed Acidity
# Wines from cool-climate grapes are usually high in acidity, while wines from
# warm-climate grapes can be low in acid.
# Red has higher fixed acidity in general. However, by location, California has
# higher fixed acidity for red wine, and Texas has higher fixed acidity for
# white wine.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = fixed.acidity, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Fixed Acidity by Type",
    x = "Type",
    y = "Fixed Acidity",
    fill = "Location"
  )
# Volatile Acidity
# Red has higher volatile acidity, and Texas has higher volatile acidity in
# both red and white.
# Legal limit: 1.2 grams per liter
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = volatile.acidity, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Volatile Acidity by Type",
    x = "Type",
    y = "Volatile Acidity",
    fill = "Location"
  )
# Citric Acid
# California tends to have higher citric acid in both red and white. The Texas
# red wine tends to have a lower distribution of citric acid.
# I see unusual outliers in California white wine.
# California tends to add more citric acid than Texas.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = citric.acid, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Citric Acid by Type",
    x = "Type",
    y = "Citric Acid",
    fill = "Location"
  )
# Residual Sugar
# Comes from natural grape sugars left over after fermentation.
# Red wine has very low residual sugar, while white has higher residual sugar.
# California white wine shows some outliers.
# White wine tends to have more natural grape sugars left over.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = residual.sugar, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Residual Sugar by Type",
    x = "Type",
    y = "Residual Sugar",
    fill = "Location"
  )
# Chlorides
# White wine tends to have lower chlorides compared to red wine, and Texas has
# a higher distribution of chlorides in both red and white.
# Red wine is saltier.
# Texas tends to have more chlorides. Texas wine is saltier.
# Usually less than 500 mg/L
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = chlorides, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Chlorides by Type",
    x = "Type",
    y = "Chlorides",
    fill = "Location"
  )
# Free Sulfur Dioxide
# Free sulfur dioxide is higher in white wine than in red. Texas white wine has
# a broader distribution of free sulfur dioxide.
# There is a significant free sulfur dioxide outlier in Texas white wine.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = free.sulfur.dioxide, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Free Sulfur Dioxide by Type",
    x = "Type",
    y = "Free Sulfur Dioxide",
    fill = "Location"
  )
# Total Sulfur Dioxide
# White wine has higher total sulfur dioxide. Texas tends to have higher
# sulfur dioxide.
# White wine needs more preservatives to prevent oxidation and spoilage and to
# maintain freshness.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = total.sulfur.dioxide, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Total Sulfur Dioxide by Type",
    x = "Type",
    y = "Total Sulfur Dioxide",
    fill = "Location"
  )
# Density
# Red wine has a higher density, and Texas wine has a higher density than
# California.
# Red wine is generally considered denser than white wine because, during its
# production, the grape skins are left in contact with the juice during
# fermentation, which extracts tannins from the skins, resulting in a "fuller
# body" and denser texture compared to white wine, where the skins are removed,
# leaving a lighter wine with less tannin content.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = density, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Density by Type",
    x = "Type",
    y = "Density",
    fill = "Location"
  )
# pH
# Red wine has higher pH, meaning lower acidity. The location doesn't show
# much difference here.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = pH, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of pH by Type",
    x = "Type",
    y = "pH",
    fill = "Location"
  )
# Sulphates
# Red wine has higher sulfates, and California has higher sulfates in both red
# and white wines.
# Wine with higher acidity requires less sulfates than wine with lower acidity.
# Wines with more sugar need more sulfites to prevent secondary fermentation.
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = sulphates, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Sulphates by Type",
    x = "Type",
    y = "Sulphates",
    fill = "Location"
  )
# Alcohol
# High sugar => high alcohol.
# Cooler climates make it more challenging for grapes to ripen, so the fruit is
# often harvested with lower sugar levels => low alcohol level.
# Warmer climates allow grapes to ripen longer on the vine, producing higher
# alcohol levels.
# California has higher alcohol!
# Location is mapped to `fill`, so the legend title must be set via `fill`.
wines %>%
  ggplot(aes(x = type, y = alcohol, fill = location)) +
  geom_boxplot() +
  theme_bw() +
  labs(
    title = "A Boxplot of Alcohol by Type",
    x = "Type",
    y = "Alcohol",
    fill = "Location"
  )
# Interactive 3D scatter of density, total SO2 and volatile acidity,
# with one marker colour per wine type.
plot_ly(
  data = wines,
  x = ~density,
  y = ~total.sulfur.dioxide,
  z = ~volatile.acidity,
  type = "scatter3d",
  mode = "markers",
  color = ~type,                    # colour encodes wine type
  colors = c("#C8102E", "#0033A0"), # palette for the type levels
  # symbol = ~type,                 # optional: vary marker shape by type too
  marker = list(size = 4, opacity = 0.7)
) |>
  layout(
    title = "3D Scatterplot: Type of Wine by Density, Total SO2, and Volatile Acidity",
    scene = list(
      xaxis = list(title = "Density"),
      yaxis = list(title = "Total SO2"),
      zaxis = list(title = "Volatile Acidity")
    ),
    legend = list(title = list(text = "Legend"))
  )
# Density: the highest point looks like an outlier => more investigation needed.
ggplot(wines, aes(x = type, y = density, fill = type)) +
  geom_boxplot() +
  labs(title = "A boxplot of density by type", x = "Type", y = "Density") +
  theme_bw()
# SO2: the legal limit is known as 350, so we should be cautious about the
# outlier we see in white wine.
ggplot(wines, aes(x = type, y = total.sulfur.dioxide, fill = type)) +
  geom_boxplot() +
  labs(title = "A boxplot of total SO2 by type", x = "Type", y = "Total SO2") +
  theme_bw()
# Volatile acidity: there is an outlier around 1.6 in red wine. Since red wine's
# legal limit is 1.4mg/L, we should consider whether to keep this outlier.
ggplot(wines, aes(x = type, y = volatile.acidity, fill = type)) +
  geom_boxplot() +
  labs(
    title = "A boxplot of volatile acidity by type",
    x = "Type",
    y = "Volatile Acidity"
  ) +
  theme_bw()
According to domain knowledge, wine with high density tends to have high residual sugar. The density outlier also has a high residual sugar value, which makes it an influential point for residual sugar.
# Pull out the rows whose density exceeds 1.01 and print them for inspection.
subset_wines <- dplyr::filter(wines, density > 1.01)
print(subset_wines)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 3152 7.9 0.33 0.28 31.6 0.053
## 2 5151 7.9 0.33 0.28 31.6 0.053
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 35 176 1.0103 3.15 0.38 8.8 white
## 2 35 176 1.0103 3.15 0.38 8.8 white
## location quality
## 1 California 6
## 2 California 6
# Residual sugar by wine type
wines %>%
  ggplot(aes(x = type, y = residual.sugar, fill = type)) +
  geom_boxplot() +
  ggtitle("A boxplot of residual sugar by type") +
  xlab("Type") +
  ylab("Residual Sugar") +
  theme_bw()
# Free sulfur dioxide by wine type; the title and y label name the variable
# that is actually plotted.
wines %>%
  ggplot(aes(x = type, y = free.sulfur.dioxide, fill = type)) +
  geom_boxplot() +
  ggtitle("A boxplot of free sulfur dioxide by type") +
  xlab("Type") +
  ylab("Free Sulfur Dioxide") +
  theme_bw()
# Positive relationship
wines %>%
  ggplot(aes(x = density, y = residual.sugar, color = type)) +
  geom_point() +
  ggtitle("A scatterplot between Density and Residual Sugar") +
  xlab("Density") +
  ylab("Residual Sugar")
# Quality: number of wines at each quality level.
wines |>
  ggplot(aes(x = quality)) +
  geom_bar(fill = "royalblue") +
  ggtitle("Distribution of Wine Quality") +
  xlab("Quality") +
  ylab("Count")
# Pair plot of quality, type and location, coloured by wine type.
ggpairs(
  data = wines,
  mapping = aes(colour = type),
  columns = c("quality", "type", "location"),
  upper = list(continuous = wrap("cor", size = 3)),
  lower = list(continuous = wrap("points", alpha = 0.6))
) +
  labs(title = "Wine Quality by Location and Type")
# Pair plot of all measurements plus quality. Colour is mapped to location,
# and the title says so.
ggpairs(
  wines,
  mapping = aes(color = location, alpha = .8),
  columns = c(
    "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
    "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
    "pH", "sulphates", "alcohol", "quality"
  ),
  title = "Pairwise Relationships with Location as Color",
  lower = list(continuous = "smooth"),
  upper = list(continuous = "cor")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Calculate mean quality and count by location.
# quality is stored as a factor, and mean() of a factor returns NA with a
# warning, so the level labels are converted back to numbers first
# (as.character() avoids picking up the internal level codes).
summary_table <- wines %>%
  group_by(location) %>%
  summarise(
    count = n(),
    mean_quality = mean(as.numeric(as.character(quality)), na.rm = TRUE)
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `mean_quality = mean(quality, na.rm = TRUE)`.
## ℹ In group 1: `location = "California"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Render the per-location summary as a pipe-delimited table.
knitr::kable(
  x = summary_table,
  format = "pipe",
  col.names = c("Location", "Count", "Mean Quality")
)
| Location | Count | Mean Quality |
|---|---|---|
| California | 3401 | NA |
| Texas | 2062 | NA |
# Proportional stacked bars of location within each quality level, labelled
# with the raw counts. after_stat(count) replaces the `..count..` notation
# deprecated in ggplot2 3.4.0.
ggplot(wines, aes(x = as.factor(quality), fill = location)) +
  geom_bar(position = "fill", color = "black", width = 0.7) + # bar border and width
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_fill(vjust = 0.5), # centre each label in its segment
    size = 3,
    color = "white",
    fontface = "bold"
  ) +
  labs(
    title = "Quality by Location: California vs. Texas",
    x = "Wine Quality",
    y = "Proportion",
    fill = "Location"
  ) +
  scale_fill_manual(values = c("California" = "#0033A0", "Texas" = "#C8102E")) + # custom colors per location
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), # centred, bold title
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "top"
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Alcohol against volatile acidity, coloured by location. The title and axis
# labels name the plotted variables instead of generic placeholders.
ggplot(wines, aes(x = alcohol, y = volatile.acidity, color = location)) +
  geom_point(alpha = .6) +
  labs(
    title = "Scatterplot of Alcohol vs. Volatile Acidity",
    x = "Alcohol",
    y = "Volatile Acidity",
    color = "Location"
  ) +
  theme_solarized()
# Interactive 3D scatter of alcohol, density and volatile acidity,
# with one marker colour per location.
plot_ly(
  data = wines,
  x = ~alcohol,
  y = ~density,
  z = ~volatile.acidity,
  type = "scatter3d",
  mode = "markers",
  color = ~location,
  colors = c("#0033A0", "#C8102E"),
  marker = list(size = 4, opacity = 0.5)
) %>%
  layout(
    title = "3D Scatterplot: Location of Wine by Alcohol, Density, and Volatile Acidity",
    scene = list(
      xaxis = list(title = "Alcohol"),
      yaxis = list(title = "Density"),
      zaxis = list(title = "Volatile Acidity")
    ),
    legend = list(title = list(text = "Legend"))
  )
# Create a 3D scatterplot coloured by quality (one colour per quality level).
# The title names quality, the variable the colour encodes.
plot_ly(
  data = wines,
  x = ~volatile.acidity,
  y = ~density,
  z = ~alcohol,
  type = "scatter3d",
  mode = "markers",
  color = ~as.factor(quality),
  colors = c("#FF0000", "#FF5733", "#FFFF33", "#33FF57", "#33FFF6", "#0099ff", "#9900ff"),
  marker = list(size = 4, opacity = 0.8)
) |>
  layout(
    title = "3D Scatterplot: Quality of Wine by Volatile Acidity, Density, and Alcohol",
    scene = list(
      xaxis = list(title = "Volatile Acidity"),
      yaxis = list(title = "Density"),
      zaxis = list(title = "Alcohol")
    ),
    legend = list(title = list(text = "Legend"))
  )
# Encode 'location' and 'type' as numeric codes. as.factor() sorts the levels,
# so the codes follow the sorted order of the labels.
wines$type <- as.numeric(as.factor(wines$type))
wines$location <- as.numeric(as.factor(wines$location))

# Build the correlation input on a copy of the data:
#   * ID is a row identifier rather than a measurement, so it is left out.
#   * quality is converted to its numeric grade; while it is a factor, an
#     is.numeric() filter silently drops it from the matrix.
corr_input <- wines[, setdiff(names(wines), "ID")]
corr_input$quality <- as.numeric(as.character(corr_input$quality))
# vapply() always returns a logical vector, unlike sapply()
numeric_columns <- corr_input[, vapply(corr_input, is.numeric, logical(1))]

# Compute the correlation matrix
cor_matrix <- cor(numeric_columns, use = "complete.obs")

# Visualize the upper triangle of the correlation matrix with the
# coefficients printed in each cell
library(corrplot)
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  title = "Correlation Matrix Including Location and Type",
  mar = c(0, 0, 1, 0), # leave room for the title
  addCoef.col = "black",
  number.cex = 0.7,
  col = colorRampPalette(c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC"))(200)
)